{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Convolutional Sentiment Classifier"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In this notebook, we build a *convolutional* neural net to classify IMDB movie reviews by their sentiment."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/the-deep-learners/deep-learning-illustrated/blob/master/notebooks/convolutional_sentiment_classifier.ipynb)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Load dependencies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using TensorFlow backend.\n"
     ]
    }
   ],
   "source": [
    "import keras\n",
    "from keras.datasets import imdb\n",
    "from keras.preprocessing.sequence import pad_sequences\n",
    "from keras.models import Sequential\n",
    "from keras.layers import Dense, Dropout, Embedding\n",
    "from keras.layers import SpatialDropout1D, Conv1D, GlobalMaxPooling1D # new! \n",
    "from keras.callbacks import ModelCheckpoint\n",
    "import os\n",
    "from sklearn.metrics import roc_auc_score \n",
    "import matplotlib.pyplot as plt \n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Set hyperparameters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# output directory name:\n",
    "output_dir = 'model_output/conv'\n",
    "\n",
    "# training:\n",
    "epochs = 4\n",
    "batch_size = 128\n",
    "\n",
    "# vector-space embedding: \n",
    "n_dim = 64\n",
    "n_unique_words = 5000 \n",
    "max_review_length = 400\n",
    "pad_type = trunc_type = 'pre'\n",
    "drop_embed = 0.2 # new!\n",
    "\n",
    "# convolutional layer architecture:\n",
    "n_conv = 256 # filters, a.k.a. kernels\n",
    "k_conv = 3 # kernel length\n",
    "\n",
    "# dense layer architecture: \n",
    "n_dense = 256\n",
    "dropout = 0.2"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# removed n_words_to_skip: only num_words is passed\n",
    "(x_train, y_train), (x_valid, y_valid) = imdb.load_data(\n",
    "    num_words=n_unique_words\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Preprocess data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# identical padding/truncation settings for both splits\n",
    "pad_kwargs = dict(maxlen=max_review_length, padding=pad_type, truncating=trunc_type, value=0)\n",
    "x_train = pad_sequences(x_train, **pad_kwargs)\n",
    "x_valid = pad_sequences(x_valid, **pad_kwargs)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "#### Design neural network architecture"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# same layer stack as repeated model.add() calls, passed as a list\n",
    "model = Sequential([\n",
    "    Embedding(n_unique_words, n_dim, input_length=max_review_length),\n",
    "    SpatialDropout1D(drop_embed),\n",
    "    Conv1D(n_conv, k_conv, activation='relu'),\n",
    "    # Conv1D(n_conv, k_conv, activation='relu'),  # optional second convolutional layer (disabled)\n",
    "    GlobalMaxPooling1D(),\n",
    "    Dense(n_dense, activation='relu'),\n",
    "    Dropout(dropout),\n",
    "    Dense(1, activation='sigmoid'),  # single sigmoid output unit\n",
    "])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "_________________________________________________________________\n",
      "Layer (type)                 Output Shape              Param #   \n",
      "=================================================================\n",
      "embedding_1 (Embedding)      (None, 400, 64)           320000    \n",
      "_________________________________________________________________\n",
      "spatial_dropout1d_1 (Spatial (None, 400, 64)           0         \n",
      "_________________________________________________________________\n",
      "conv1d_1 (Conv1D)            (None, 398, 256)          49408     \n",
      "_________________________________________________________________\n",
      "global_max_pooling1d_1 (Glob (None, 256)               0         \n",
      "_________________________________________________________________\n",
      "dense_1 (Dense)              (None, 256)               65792     \n",
      "_________________________________________________________________\n",
      "dropout_1 (Dropout)          (None, 256)               0         \n",
      "_________________________________________________________________\n",
      "dense_2 (Dense)              (None, 1)                 257       \n",
      "=================================================================\n",
      "Total params: 435,457\n",
      "Trainable params: 435,457\n",
      "Non-trainable params: 0\n",
      "_________________________________________________________________\n"
     ]
    }
   ],
   "source": [
    "# layer-by-layer output shapes and parameter counts\n",
    "model.summary()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Configure model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.compile(\n",
    "    optimizer='adam',\n",
    "    loss='binary_crossentropy',\n",
    "    metrics=['accuracy'],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# create the checkpoint directory up front; exist_ok avoids the check-then-create race\n",
    "os.makedirs(output_dir, exist_ok=True)\n",
    "\n",
    "# checkpoint callback; the {epoch:02d} placeholder in the file name is filled in by Keras\n",
    "modelcheckpoint = ModelCheckpoint(filepath=output_dir+\"/weights.{epoch:02d}.hdf5\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Train!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train on 25000 samples, validate on 25000 samples\n",
      "Epoch 1/4\n",
      "25000/25000 [==============================] - 41s 2ms/step - loss: 0.4894 - acc: 0.7447 - val_loss: 0.2971 - val_acc: 0.8750\n",
      "Epoch 2/4\n",
      "25000/25000 [==============================] - 41s 2ms/step - loss: 0.2534 - acc: 0.8972 - val_loss: 0.2604 - val_acc: 0.8914\n",
      "Epoch 3/4\n",
      "25000/25000 [==============================] - 41s 2ms/step - loss: 0.1709 - acc: 0.9357 - val_loss: 0.2577 - val_acc: 0.8959\n",
      "Epoch 4/4\n",
      "25000/25000 [==============================] - 41s 2ms/step - loss: 0.1151 - acc: 0.9589 - val_loss: 0.2828 - val_acc: 0.8934\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<keras.callbacks.History at 0x7f3ee18eee10>"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.fit(\n",
    "    x_train, y_train,\n",
    "    batch_size=batch_size,\n",
    "    epochs=epochs,\n",
    "    verbose=1,\n",
    "    validation_data=(x_valid, y_valid),\n",
    "    callbacks=[modelcheckpoint],\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "#### Evaluate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# epoch 3 had the lowest validation loss in the training log above\n",
    "weights_path = output_dir + \"/weights.03.hdf5\"\n",
    "model.load_weights(weights_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# predict() returns the sigmoid outputs directly; predict_proba() was a thin alias for it on\n",
    "# Sequential models and has been removed from recent Keras/TensorFlow releases\n",
    "y_hat = model.predict(x_valid)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYAAAAD8CAYAAAB+UHOxAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAD3JJREFUeJzt3X2snnV9x/H3Z1R0PiBoq3Ftt4OxOqvJImkQZ+KcNTxpKH/AUjNnNc2aOOacM9tw+6OLSqJ7YjPxYZ1lq8YJjJnRKBthPMRtGdWDOCYwQgcMOpgcbanbiA/V7/64f+ABT3uu055z35z+3q/k5L6u3/W77uv37Tncn/t6JFWFJKk/PzbpAUiSJsMAkKROGQCS1CkDQJI6ZQBIUqcMAEnqlAEgSZ0yACSpUwaAJHVqxaQHcCQrV66sqampSQ9D+lHfumv0etLLJjsOaQ633HLLN6pq1Xz9ntIBMDU1xfT09KSHIf2of3j96PWNN01yFNKckvznkH4eApKkThkAktQpA0CSOmUASFKnDABJ6pQBIEmdMgAkqVMGgCR1ygCQpE49pe8EPlZTF39hItu970Nvmsh2JS2uSX2GwHg+R9wDkKROGQCS1CkDQJI6ZQBIUqcMAEnqlAEgSZ0yACSpUwaAJHXKAJCkThkAktQpA0CSOmUASFKnDABJ6pQBIEmdMgAkqVMGgCR1ygCQpE4ZAJLUKQNAkjplAEhSpwwASeqUASBJnTIAJKlTBoAkdcoAkKRODQqAJO9JcnuSryX5bJJnJDk1yZ4kdye5IsmJre/T2/zetnxq1vu8r7XfleSspSlJkjTEvAGQZDXwa8CGqnolcAKwGfgwcGlVrQMOAFvbKluBA1X1EuDS1o8k69t6rwDOBj6W5ITFLUeSNNTQQ0ArgB9PsgJ4JvAQ8AbgqrZ8F3B+m97U5mnLNyZJa7+8qr5TVfcCe4HTj70ESdLRmDcAquq/gD8E7mf0wX8QuAV4pKoOtW77gNVtejXwQFv3UOv//Nntc6wjSRqzIYeATmH07f1U4CeAZwHnzNG1HlvlMMsO1/7k7W1LMp1kemZmZr7hSZKO0pBDQG8E7q2qmar6HvA54GeBk9shIYA1wINteh+wFqAtfy6wf3b7HOs8rqp2VNWGqtqwatWqoyhJkjTEkAC4HzgjyTPbsfyNwB3AjcAFrc8W4Oo2vbvN05bfUFXV2je3q4ROBdYBX1qcMiRJC7Vivg5VtSfJVcBXgEPArcAO4AvA5Uk+2Np2tlV2Ap9OspfRN//N7X1uT3Ilo/A4BFxUVd9f5HokSQPNGwAAVbUd2P6k5nuY4yqeqvo2cOFh3ucS4JIFjlGStAS8E1iSOmUASFKnDABJ6pQBIEmdMgAkqVMGgCR1ygCQpE4ZAJLUKQNAkjplAEhSpwwASeqUASBJnTIAJKlTBoAkdcoAkKROGQCS1CkDQJI6ZQBIUqcMAEnqlAEgSZ0yACSpUwaAJHXKAJCkThkAktQpA0CSOmUASFKnDABJ6pQBIEmdMgAkqVMGgCR1ygCQpE4ZAJLUKQNAkjplAEhSpwwASerUoABIcnKSq5L8e5I7k7wmyfOSXJfk7vZ6SuubJB9JsjfJbUlOm/U+W1r/u5NsWaqiJEnzG7oH8KfA31fVTwM/A9wJXAxcX1XrgOvbPMA5wLr2sw34OECS5wHbgVcDpwPbHwsNSdL4zRsASU4CXgfsBKiq71bVI8AmYFfrtgs4v01vAj5VIzcDJyd5EXAWcF1V7a+qA8B1wNmLWo0kabAhewAvBmaAv0hya5JPJnkW8MKqegigvb6g9V8NPDBr/X2t7XDtkqQJGBIAK4DTgI9X1auA/+OHh3vmkjna6gjtT1w52ZZkOsn0zMzMgOFJko7GkADYB+yrqj1t/ipGgfD1dmiH9vrwrP5rZ62/BnjwCO1PUFU7qmpDVW1YtWrVQmqRJC3AvAFQVf8N
PJDkZa1pI3AHsBt47EqeLcDVbXo38LZ2NdAZwMF2iOha4Mwkp7STv2e2NknSBKwY2O9dwGeSnAjcA7yDUXhcmWQrcD9wYet7DXAusBd4tPWlqvYn+QDw5dbv/VW1f1GqkCQt2KAAqKqvAhvmWLRxjr4FXHSY97kMuGwhA5QkLQ3vBJakThkAktQpA0CSOmUASFKnDABJ6pQBIEmdMgAkqVMGgCR1ygCQpE4ZAJLUKQNAkjplAEhSpwwASeqUASBJnTIAJKlTBoAkdcoAkKROGQCS1CkDQJI6ZQBIUqcMAEnqlAEgSZ0yACSpUwaAJHXKAJCkThkAktQpA0CSOmUASFKnDABJ6pQBIEmdMgAkqVMGgCR1ygCQpE4ZAJLUKQNAkjplAEhSpwYHQJITktya5PNt/tQke5LcneSKJCe29qe3+b1t+dSs93hfa78ryVmLXYwkabiF7AG8G7hz1vyHgUurah1wANja2rcCB6rqJcClrR9J1gObgVcAZwMfS3LCsQ1fknS0BgVAkjXAm4BPtvkAbwCual12Aee36U1tnrZ8Y+u/Cbi8qr5TVfcCe4HTF6MISdLCDd0D+BPgt4AftPnnA49U1aE2vw9Y3aZXAw8AtOUHW//H2+dY53FJtiWZTjI9MzOzgFIkSQsxbwAkeTPwcFXdMrt5jq41z7IjrfPDhqodVbWhqjasWrVqvuFJko7SigF9Xgucl+Rc4BnASYz2CE5OsqJ9y18DPNj67wPWAvuSrACeC+yf1f6Y2etIksZs3j2AqnpfVa2pqilGJ3FvqKpfBG4ELmjdtgBXt+ndbZ62/Iaqqta+uV0ldCqwDvjSolUiSVqQIXsAh/PbwOVJPgjcCuxs7TuBTyfZy+ib/2aAqro9yZXAHcAh4KKq+v4xbF+SdAwWFABVdRNwU5u+hzmu4qmqbwMXHmb9S4BLFjpISdLi805gSeqUASBJnTIAJKlTBoAkdcoAkKROGQCS1CkDQJI6ZQBIUqcMAEnqlAEgSZ0yACSpUwaAJHXKAJCkThkAktQpA0CSOmUASFKnDABJ6pQBIEmdMgAkqVMGgCR1ygCQpE4ZAJLUKQNAkjplAEhSpwwASeqUASBJnTIAJKlTBoAkdcoAkKROGQCS1CkDQJI6ZQBIUqcMAEnqlAEgSZ0yACSpU/MGQJK1SW5McmeS25O8u7U/L8l1Se5ur6e09iT5SJK9SW5Lctqs99rS+t+dZMvSlSVJms+QPYBDwHur6uXAGcBFSdYDFwPXV9U64Po2D3AOsK79bAM+DqPAALYDrwZOB7Y/FhqSpPGbNwCq6qGq+kqb/h/gTmA1sAnY1brtAs5v05uAT9XIzcDJSV4EnAVcV1X7q+oAcB1w9qJWI0kabEHnAJJMAa8C9gAvrKqHYBQSwAtat9XAA7NW29faDtcuSZqAwQGQ5NnA3wC/XlXfOlLXOdrqCO1P3s62JNNJpmdmZoYOT5K0QIMCIMnTGH34f6aqPteav94O7dBeH27t+4C1s1ZfAzx4hPYnqKodVbWhqjasWrVqIbVIkhZgyFVAAXYCd1bVH89atBt47EqeLcDVs9rf1q4GOgM42A4RXQucmeSUdvL3zNYmSZqAFQP6vBb4JeDfkny1tf0O8CHgyiRbgfuBC9uya4Bzgb3Ao8A7AKpqf5IPAF9u/d5fVfsXpQpJ0oLNGwBV9U/MffweYOMc/Qu46DDvdRlw2UIGKElaGt4JLEmdMgAkqVMGgCR1ygCQpE4ZAJLUKQNAkjplAEhSpwwASeqUASBJnTIAJKlTBoAkdcoAkKRODXkaqCRN1NTFX5j0EI5LBsASmNQf630fetNEtitpefIQkCR1ygCQpE4ZAJLUKQNAkjplAEhSpwwASeqUASBJnfI+AEmDeUPW8cU9AEnqlHsAx5FJfjvzLmRp+XEPQJI6ZQBIUqc8BKRF0dvJwctf/E0ANndWt44v7gFIUqcMAEnqlAEgSZ0yACSpUwaAJHXKAJCkThkAktQp
A0CSOjX2AEhydpK7kuxNcvG4ty9JGhlrACQ5AfgocA6wHnhLkvXjHIMkaWTcewCnA3ur6p6q+i5wObBpzGOQJDH+AFgNPDBrfl9rkySN2bgfBpc52uoJHZJtwLY2+79J7jqG7a0EvnEM6y83vdULE6r5NY9PvXncmwZ/z13Ih4+p5p8a0mncAbAPWDtrfg3w4OwOVbUD2LEYG0syXVUbFuO9loPe6gVr7oU1L41xHwL6MrAuyalJTgQ2A7vHPAZJEmPeA6iqQ0l+FbgWOAG4rKpuH+cYJEkjY/8fwlTVNcA1Y9rcohxKWkZ6qxesuRfWvARSVfP3kiQdd3wUhCR1atkHwHyPlkjy9CRXtOV7kkyNf5SLa0DNv5HkjiS3Jbk+yaBLwp7Khj5CJMkFSSrJsr9iZEjNSX6h/a5vT/JX4x7jYhvwt/2TSW5Mcmv7+z53EuNcLEkuS/Jwkq8dZnmSfKT9e9yW5LRFHUBVLdsfRieS/wN4MXAi8K/A+if1+RXgE216M3DFpMc9hpp/Hnhmm35nDzW3fs8BvgjcDGyY9LjH8HteB9wKnNLmXzDpcY+h5h3AO9v0euC+SY/7GGt+HXAa8LXDLD8X+DtG91CdAexZzO0v9z2AIY+W2ATsatNXARuTzHVD2nIxb81VdWNVPdpmb2Z0v8VyNvQRIh8Afh/49jgHt0SG1PzLwEer6gBAVT085jEutiE1F3BSm34uT7qPaLmpqi8C+4/QZRPwqRq5GTg5yYsWa/vLPQCGPFri8T5VdQg4CDx/LKNbGgt9nMZWRt8glrN5a07yKmBtVX1+nANbQkN+zy8FXprkn5PcnOTssY1uaQyp+feAtybZx+hqwneNZ2gTs6SPzxn7ZaCLbN5HSwzss5wMrifJW4ENwM8t6YiW3hFrTvJjwKXA28c1oDEY8ntewegw0OsZ7eX9Y5JXVtUjSzy2pTKk5rcAf1lVf5TkNcCnW80/WPrhTcSSfn4t9z2AeR8tMbtPkhWMdhuPtMv1VDekZpK8Efhd4Lyq+s6YxrZU5qv5OcArgZuS3MfoWOnuZX4ieOjf9tVV9b2quhe4i1EgLFdDat4KXAlQVf8CPIPRc4KOV4P+ez9ayz0AhjxaYjewpU1fANxQ7ezKMjVvze1wyJ8x+vBf7seFYZ6aq+pgVa2sqqmqmmJ03uO8qpqezHAXxZC/7b9ldMKfJCsZHRK6Z6yjXFxDar4f2AiQ5OWMAmBmrKMcr93A29rVQGcAB6vqocV682V9CKgO82iJJO8HpqtqN7CT0W7iXkbf/DdPbsTHbmDNfwA8G/jrdr77/qo6b2KDPkYDaz6uDKz5WuDMJHcA3wd+s6q+OblRH5uBNb8X+PMk72F0KOTty/kLXZLPMjqEt7Kd19gOPA2gqj7B6DzHucBe4FHgHYu6/WX8bydJOgbL/RCQJOkoGQCS1CkDQJI6ZQBIUqcMAEnqlAEgSZ0yACSpUwaAJHXq/wGZUhxlVrDGKAAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x7f3ee04a3278>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# histogram of the predicted probabilities, with a vertical marker at 0.5\n",
    "plt.hist(y_hat)\n",
    "_ = plt.axvline(0.5, color=\"orange\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'96.12'"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# ROC AUC on the validation set, as a percentage with two decimals\n",
    "pct_auc = roc_auc_score(y_valid, y_hat) * 100.0\n",
    "\"{:0.2f}\".format(pct_auc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
